Importing datasets

#day is a more comprehensive data set and can be used more inclusively vs hourly.

day <- read_csv("day.csv", show_col_types = FALSE)

#rename some variables
day <- rename(day, date = dteday, year = yr, month = mnth, weather_situation = weathersit, count = cnt, serial = instant)

day
## # A tibble: 731 × 16
##    serial date       season  year month holiday weekday workingday
##     <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl>      <dbl>
##  1      1 2011-01-01      1     0     1       0       6          0
##  2      2 2011-01-02      1     0     1       0       0          0
##  3      3 2011-01-03      1     0     1       0       1          1
##  4      4 2011-01-04      1     0     1       0       2          1
##  5      5 2011-01-05      1     0     1       0       3          1
##  6      6 2011-01-06      1     0     1       0       4          1
##  7      7 2011-01-07      1     0     1       0       5          1
##  8      8 2011-01-08      1     0     1       0       6          0
##  9      9 2011-01-09      1     0     1       0       0          0
## 10     10 2011-01-10      1     0     1       0       1          1
## # ℹ 721 more rows
## # ℹ 8 more variables: weather_situation <dbl>, temp <dbl>, atemp <dbl>,
## #   hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>, count <dbl>

This code chunk answers Further Researched Question #1:

Did any major weather events cause changes in the average bike rentals, and was it something that may affect our data/predictions.

#Changing levels for clarity
day$weather_situation <- factor(day$weather_situation, levels = c(1, 2, 3), labels = c("Clear", "Cloudy", "Rain"))

#Find average based on weather situation first
average_rentals_by_weather <- day %>%
  group_by(weather_situation) %>%
  summarize(average_rentals = mean(count, na.rm = TRUE))

print(average_rentals_by_weather)
## # A tibble: 3 × 2
##   weather_situation average_rentals
##   <fct>                       <dbl>
## 1 Clear                       4877.
## 2 Cloudy                      4036.
## 3 Rain                        1803.
ggplot(average_rentals_by_weather, aes(x = weather_situation, y = average_rentals)) +
  geom_bar(stat = "identity") +
  labs(x = "Weather Situation", y = "Average Bike Rentals", 
       title = "Average Bike Rentals by Weather Situation")

#Plotting bike rentals over time
ggplot(day, aes(x = date, y = count, color = weather_situation)) +
  geom_line() +
  labs(x = "Date", y = "Number of Bike Rentals",
       title = "Daily Bike Rentals Over Time by Weather Situation")

We look at the year 2011

#get the year 2011 values alone
day_2011 <- day %>% 
  filter(year(date) == 2011)

average_rentals_by_weather_2011 <- day_2011 %>%
  group_by(weather_situation) %>%
  summarize(average_rentals = mean(count, na.rm = TRUE))

print(average_rentals_by_weather_2011)
## # A tibble: 3 × 2
##   weather_situation average_rentals
##   <fct>                       <dbl>
## 1 Clear                       3695.
## 2 Cloudy                      3088.
## 3 Rain                        1674.
ggplot(average_rentals_by_weather_2011, aes(x = weather_situation, y = average_rentals)) +
  geom_bar(stat = "identity") +
  labs(x = "Weather Situation", y = "Average Bike Rentals", 
       title = "Average Bike Rentals by Weather Situation in 2011")

ggplot(day_2011, aes(x = date, y = count, color = weather_situation)) +
  geom_line() +
  labs(x = "Date", y = "Number of Bike Rentals",
       title = "Daily Bike Rentals Over Time by Weather Situation in 2011")

Match specific dates

# Create a data frame with specific dates and their corresponding weather event labels
weather_events <- data.frame(
  date = mdy(c("January 26, 2011", "September 4, 2011", "September 5, 2011", 
               "September 6, 2011", "September 7, 2011", "September 8, 2011", 
               "August 28, 2011", "April 27, 2011", "April 28, 2011")),
  event = c("Snowstorm", "Tropical Storm Lee", "Tropical Storm Lee", 
            "Tropical Storm Lee", "Tropical Storm Lee", "Tropical Storm Lee", 
            "Hurricane Irene", "Tornado Outbreak", "Tornado Outbreak")
)

# Merge the weather events with day_2011 to get the counts for those dates
highlighted_data <- left_join(day_2011, weather_events, by = "date")

# Filter out the rows where label is NA to use in geom_point and geom_text
filtered_highlighted_data <- highlighted_data %>% 
  filter(!is.na(event))

# Create the plot
p <- ggplot(day_2011, aes(x = date, y = count)) +
  geom_line(aes(color = weather_situation)) +  # Line for daily data
  geom_point(data = filtered_highlighted_data, aes(x = date, y = count, color = event), size = 2) +  # Points for events
  geom_text(data = filtered_highlighted_data, aes(x = date, y = count, label = event), 
            hjust = "inward", vjust = "inward", size = 3, check_overlap = TRUE, nudge_y = 200) +  # Labels for events
  labs(title = "Daily Bike Rentals and Weather Events in 2011", x = "Date", y = "Number of Bike Rentals") +
  scale_color_manual(values = c("good" = "green", "okay" = "orange", "bad" = "red", 
                                "Snowstorm" = "blue", "Tropical Storm Lee" = "purple", 
                                "Hurricane Irene" = "black", "Tornado Outbreak" = "brown")) +
  theme_minimal() +
  theme(plot.title = element_text(size = 20),
        axis.title = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(p)

# Calculate the monthly averages for 2011
monthly_avg_2011 <- day_2011 %>%
  group_by(month = floor_date(date, "month")) %>%
  summarize(average_count = mean(count))

# Define the specific weather events for 2011
weather_events_2011 <- data.frame(
  date = mdy(c("January 26, 2011", "September 4, 2011", "September 5, 2011", 
               "September 6, 2011", "September 7, 2011", "September 8, 2011", 
               "August 28, 2011", "April 27, 2011", "April 28, 2011")),
  event = c("Snowstorm", "Tropical Storm Lee", "Tropical Storm Lee", 
            "Tropical Storm Lee", "Tropical Storm Lee", "Tropical Storm Lee", 
            "Hurricane Irene", "Tornado Outbreak", "Tornado Outbreak")
)

# Merge weather events with day_2011 to get the count for those dates
weather_data_2011 <- inner_join(weather_events_2011, day_2011, by = "date")

# Plot the monthly average line for 2011
p1 <- ggplot(monthly_avg_2011, aes(x = month, y = average_count)) +
  geom_line(color = "blue", size = 1) +
  labs(title = "Monthly Average Bike Rentals with Weather Events in 2011",
       x = "Month", y = "Average Bike Rentals") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Overlay the actual counts for the weather event dates on the monthly average line for 2011
p1 <- p1 + geom_point(data = weather_data_2011, aes(x = date, y = count, color = event), size = 3) +
  scale_color_manual(values = c("Snowstorm" = "blue", "Tropical Storm Lee" = "purple",
                                "Hurricane Irene" = "red", "Tornado Outbreak" = "brown")) +
  theme(legend.position = "bottom")

# Print the plot for 2011
print(p1)

Looking at the year 2011 we can match exact dates to weather occurrences. On January 26, there is an all time low for the 26-27th for count of bike rentals, this is related to the rush hour snowstorm that occurs. There are 506 rentals and 431 respectively. In the month of July there was a heat wave and had the hottest days which explains why it has the month with highest rentals. Then in August there were hurricanes which impacted the rentals even though the weather was okay.

Now analyze the year 2012 alone to see if there were specific weather occurences

#get the year 2012 values alone
day_2012 <- day %>%
  filter(year(date) == 2012)

average_rentals_by_weather_2012 <- day_2012 %>%
  group_by(weather_situation) %>%
  summarize(average_rentals = mean(count, na.rm = TRUE))

print(average_rentals_by_weather_2012)
## # A tibble: 3 × 2
##   weather_situation average_rentals
##   <fct>                       <dbl>
## 1 Clear                       6004.
## 2 Cloudy                      4991.
## 3 Rain                        2126.
ggplot(average_rentals_by_weather_2012, aes(x = weather_situation, y = average_rentals)) +
  geom_bar(stat = "identity") +
  labs(x = "Weather Situation", y = "Average Bike Rentals", 
       title = "Average Bike Rentals by Weather Situation in 2012")

ggplot(day_2012, aes(x = date, y = count, color = weather_situation)) +
  geom_line() +
  labs(x = "Date", y = "Number of Bike Rentals",
       title = "Daily Bike Rentals Over Time by Weather Situation in 2012")

merge for 2012

# Specific dates and labels for weather events in 2012
weather_events_2012 <- data.frame(
  date = c(mdy("June 1, 2012"), mdy("June 29, 2012"), mdy("October 29, 2012"), mdy("October 30, 2012")),
  event = c("Tornado Outbreak", "Derecho", "Hurricane Sandy", "Hurricane Sandy")
)

# Merge the weather events with day_2012 to get the counts for those dates
highlighted_data_2012 <- left_join(day_2012, weather_events_2012, by = "date")

# Filter out the rows without events for use in geom_point
events_data_2012 <- highlighted_data_2012 %>% filter(!is.na(event))

# Create the plot for 2012
p_2012 <- ggplot(day_2012, aes(x = date, y = count, group = weather_situation, color = weather_situation)) +
  geom_line() +  # Line for daily data
  geom_point(data = events_data_2012, aes(x = date, y = count, color = event), size = 3) +  # Colored points for events
  scale_color_manual(values = c("good" = "green", "okay" = "orange", "bad" = "red", 
                                "Tornado Outbreak" = "blue", "Derecho" = "cyan", 
                                "Hurricane Sandy" = "chocolate")) +
  labs(title = "Daily Bike Rentals and Weather Events in 2012", x = "Date", y = "Number of Bike Rentals") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 20),
        axis.title = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(p_2012)

# Filter out the 2012 data
day_2012 <- day %>% 
  filter(year(date) == 2012)

# Define specific weather events for 2012
weather_events_2012 <- data.frame(
  date = mdy(c("June 1, 2012", "June 29, 2012", "October 29, 2012", "October 30, 2012")),
  event = c("Tornado Outbreak", "Derecho", "Hurricane Sandy", "Hurricane Sandy")
)

# Merge weather events with day_2012 to get the count for those dates
weather_data_2012 <- inner_join(weather_events_2012, day_2012, by = "date")

# Calculate the monthly averages for 2012
monthly_avg_2012 <- day_2012 %>%
  group_by(month = floor_date(date, "month")) %>%
  summarize(average_count = mean(count))

# Plot the monthly average line for 2012
p2 <- ggplot(monthly_avg_2012, aes(x = month, y = average_count)) +
  geom_line(color = "blue", size = 1) +
  labs(title = "Monthly Average Bike Rentals with Weather Events in 2012",
       x = "Month", y = "Average Bike Rentals") +
  theme_minimal()

# Overlay the actual counts for the weather event dates on the monthly average line for 2012
p2 <- p2 + geom_point(data = weather_data_2012, aes(x = date, y = count, color = event), size = 3) +
  scale_color_manual(values = c("Tornado Outbreak" = "blue", "Derecho" = "orange",
                                "Hurricane Sandy" = "red")) +
  theme(legend.position = "bottom")

# Print the plot for 2012
print(p2)

Calculate standard deviations for 2011 and 2012

# Calculate the mean count for each year's data
mean_count_2011 <- mean(day_2011$count, na.rm = TRUE)
mean_count_2012 <- mean(day_2012$count, na.rm = TRUE)

# Assuming 'weather_events_2011' and 'weather_events_2012' contain the specific event dates and names for each year
# Inner join is used to filter the day data to only include the dates where events occurred
day_events_2011 <- inner_join(day_2011, weather_events_2011, by = "date")
day_events_2012 <- inner_join(day_2012, weather_events_2012, by = "date")

# Calculate deviation for each specific event point from the mean for 2011
day_events_2011 <- day_events_2011 %>%
  mutate(deviation_2011 = count - mean_count_2011)

# Calculate deviation for each specific event point from the mean for 2012
day_events_2012 <- day_events_2012 %>%
  mutate(deviation_2012 = count - mean_count_2012)

# Select relevant columns for deviation tables
deviation_table_2011 <- day_events_2011 %>%
  select(date, event, count, deviation_2011)

deviation_table_2012 <- day_events_2012 %>%
  select(date, event, count, deviation_2012)

# Print the deviation tables for 2011 and 2012
print(deviation_table_2011)
## # A tibble: 9 × 4
##   date       event              count deviation_2011
##   <date>     <chr>              <dbl>          <dbl>
## 1 2011-01-26 Snowstorm            506        -2900. 
## 2 2011-04-27 Tornado Outbreak    3872          466. 
## 3 2011-04-28 Tornado Outbreak    4058          652. 
## 4 2011-08-28 Hurricane Irene     4334          928. 
## 5 2011-09-04 Tropical Storm Lee  4940         1534. 
## 6 2011-09-05 Tropical Storm Lee  3351          -54.8
## 7 2011-09-06 Tropical Storm Lee  2710         -696. 
## 8 2011-09-07 Tropical Storm Lee  1996        -1410. 
## 9 2011-09-08 Tropical Storm Lee  1842        -1564.
print(deviation_table_2012)
## # A tibble: 4 × 4
##   date       event            count deviation_2012
##   <date>     <chr>            <dbl>          <dbl>
## 1 2012-06-01 Tornado Outbreak  4127         -1473.
## 2 2012-06-29 Derecho           5463          -137.
## 3 2012-10-29 Hurricane Sandy     22         -5578.
## 4 2012-10-30 Hurricane Sandy   1096         -4504.

Upon analyzing the data in 2012, there are two low points that can be identified around June and October/November. When the bike shares were at all time lows at this point when the weather was bad, we searched for a reason for this. According to the top 5 weather incidents of 2012, there was hurricane Sandy in late October and also the June 29th Derecho where weather temperatures were at an all time high. The data points line up with the weather occurences and explain how weather can impact the number of bike rentals.

Modeling to predict rentals

library(randomForest)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(dplyr)
library(tidyr)

# Assuming `day` is your dataset and you have already preprocessed it

# Split data into training and testing sets
set.seed(123) # for reproducibility
index <- createDataPartition(day$count, p = 0.8, list = FALSE)
train_data <- day[index, ]
test_data <- day[-index, ]

# Random Forest model excluding 'registered' and 'casual'
rf_model <- randomForest(count ~ . -serial, data = train_data, mtry = 3, ntree = 500)

# Evaluate model performance
rf_predictions <- predict(rf_model, test_data)
rf_rmse <- RMSE(rf_predictions, test_data$count)
rf_mae <- MAE(rf_predictions, test_data$count)

print(rf_rmse)
## [1] 307.1235
print(rf_mae)
## [1] 214.6393
# Check variable importance
importance(rf_model)
##                   IncNodePurity
## date                  359404554
## season                 83637436
## year                  126188022
## month                  55405975
## holiday                 2532730
## weekday                17627038
## workingday             16130248
## weather_situation      23466703
## temp                  221132863
## atemp                 177856209
## hum                    42922837
## windspeed              24959007
## casual                332615536
## registered            664895321
# If you need to perform hyperparameter tuning
control <- trainControl(method = "cv", number = 10)
grid <- expand.grid(mtry = c(2, 3, 4))

tuned_rf <- train(count ~ . -serial, data = train_data, method = "rf",
                  tuneGrid = grid, trControl = control)
                  
# Check the results
print(tuned_rf)
## Random Forest 
## 
## 587 samples
##  15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 528, 527, 529, 530, 529, 527, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   2     415.7918  0.9625146  289.9533
##   3     332.7804  0.9751560  228.1642
##   4     288.2509  0.9811153  194.9500
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.

predict by splitting dates

library(caret)
library(dplyr)
library(lubridate)
library(ggplot2)

# Assuming `day` is your dataset and it has a `date` column.

# Convert date to Date class if not already
day$date <- as.Date(day$date)

# Sort the data by date just in case
day <- day %>% arrange(date)

# Split the data: for example, use the first 80% of the days for training, the rest for testing.
# Find the split point.
split_date <- day$date[round(nrow(day) * 0.8)]

# Create the training and testing datasets.
train_set <- day %>% filter(date <= split_date)
test_set <- day %>% filter(date > split_date)

# Now, train your model on the train_set
# Let's say you're using a random forest model from the 'randomForest' package.
library(randomForest)

# Specify the model formula: count is the response variable, and the rest are predictors.
# Exclude serial and date from the predictors as they are not informative for the model.
model_formula <- as.formula("count ~ . - serial - date")

# Train the model.
rf_model <- randomForest(model_formula, data=train_set)

# Make predictions on the test set.
predictions <- predict(rf_model, test_set)

# Evaluate the model's performance.
# Calculate RMSE and R^2 as an example.
rmse <- sqrt(mean((predictions - test_set$count)^2))
rsquared <- cor(predictions, test_set$count)^2

# Output the performance.
cat("RMSE on the test set:", rmse, "\n")
## RMSE on the test set: 654.5102
cat("R-squared on the test set:", rsquared, "\n")
## R-squared on the test set: 0.9396156

Plotting variable importance:

# Assuming 'rf_model' is the trained random forest model
var_imp <- importance(rf_model)

# Convert to data frame for plotting
var_imp_df <- data.frame(
  Variable = rownames(var_imp),
  Importance = var_imp[, "IncNodePurity"]
)

# Plot variable importance using ggplot2
ggplot(var_imp_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates to make the plot horizontal
  labs(x = "Variables", y = "Importance", title = "Variable Importance Plot") +
  theme_minimal()

Partial Dependence plots

library(randomForest)
library(ggplot2)
library(pdp) # for partial dependence plots
## 
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
## 
##     partial
# Assuming 'rf_model' is your trained random forest model without the 'registered' and 'casual' variables
# and 'train_set' is your training dataset.

# List of weather-related variables to plot
weather_variables <- c("temp", "atemp", "hum", "windspeed", "weather_situation")

# Create empty list to store partial plots
partial_plots_list <- list()

# Loop over weather variables and create partial dependence plots
for (variable in weather_variables) {
  # Calculate partial dependence for the specific variable
  pd <- partial(rf_model, pred.var = variable, train = train_set, grid.resolution = 50)
  
  # Convert to data frame for ggplot
  pd_df <- as.data.frame(pd)
  
  # Create the plot
  p <- ggplot(pd_df, aes_string(x = names(pd_df)[2], y = names(pd_df)[1])) +
    geom_line() +
    labs(title = paste("Partial Dependence Plot for", variable),
         x = variable,
         y = "Partial Dependence") +
    theme_minimal()
  
  # Add the plot to the list
  partial_plots_list[[variable]] <- p
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the plots for weather-related variables
library(gridExtra)
grid.arrange(grobs = partial_plots_list, ncol = 2)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Load necessary libraries
library(tidyverse)
library(lubridate)
library(ggplot2)
library(randomForest)
library(caret)

# Load your dataset
day1 <- read_csv("day.csv")
## Rows: 731 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (15): instant, season, yr, mnth, holiday, weekday, workingday, weathers...
## date  (1): dteday
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the column names to ensure correct referencing
print(colnames(day1))
##  [1] "instant"    "dteday"     "season"     "yr"         "mnth"      
##  [6] "holiday"    "weekday"    "workingday" "weathersit" "temp"      
## [11] "atemp"      "hum"        "windspeed"  "casual"     "registered"
## [16] "cnt"
# Assuming day dataset has the following weather-related columns:
# temp (temperature), atemp (feeling temperature), hum (humidity), windspeed
# Weathersit column should be checked for correct name

# If 'weathersit' is named differently, replace 'weathersit' in the code below with the correct column name

# Convert weather situation into a factor with readable levels
day1$weathersit <- factor(day1$weathersit, levels = c(1, 2, 3), labels = c("Clear", "Mist", "Light Rain/Snow"))

# Create interaction terms
day1$interaction_temp_hum <- day1$temp * day1$hum
day1$interaction_temp_windspeed <- day1$temp * day1$windspeed

# Exploratory Data Analysis (EDA)
# Plotting relationships between weather variables and bike rentals
p1 <- ggplot(day1, aes(x = temp, y = cnt)) + geom_point() + geom_smooth(method = "lm") +
  labs(title = "Effect of Temperature on Bike Rentals", x = "Temperature", y = "Count of Rentals")
p2 <- ggplot(day1, aes(x = hum, y = cnt)) + geom_point() + geom_smooth(method = "lm") +
  labs(title = "Effect of Humidity on Bike Rentals", x = "Humidity", y = "Count of Rentals")
p3 <- ggplot(day1, aes(x = windspeed, y = cnt)) + geom_point() + geom_smooth(method = "lm") +
  labs(title = "Effect of Wind Speed on Bike Rentals", x = "Wind Speed", y = "Count of Rentals")
p4 <- ggplot(day1, aes(x = weathersit, y = cnt)) + geom_boxplot() +
  labs(title = "Effect of Weather Condition on Bike Rentals", x = "Weather Situation", y = "Count of Rentals")

# Display plots
grid.arrange(p1, p2, p3, p4, nrow = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Build a Random Forest Model to quantify impact
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(day1$cnt, p = 0.8, list = FALSE)
trainData <- day1[trainIndex,]
testData <- day1[-trainIndex,]

model <- randomForest(cnt ~ temp + atemp + hum + windspeed + weathersit, data = trainData)
print(model)
## 
## Call:
##  randomForest(formula = cnt ~ temp + atemp + hum + windspeed +      weathersit, data = trainData) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 1655509
##                     % Var explained: 55.23
# Evaluate model performance
predictions <- predict(model, testData)
result <- data.frame(Actual = testData$cnt, Predicted = predictions)
cor(result$Actual, result$Predicted) # Checking correlation between actual and predicted values
## [1] 0.780705
# Variable importance
importance(model)
##            IncNodePurity
## temp           588674282
## atemp          556581016
## hum            221951153
## windspeed      193420231
## weathersit     101791654
# Further tuning and validation could be performed based on initial results
trainData <- trainData %>%
  mutate(interaction_temp_hum = temp * hum,
         interaction_temp_windspeed = temp * windspeed)

# Rebuild the model including interactions
model_with_interactions <- randomForest(
  cnt ~ temp + atemp + hum + windspeed + weathersit + interaction_temp_hum + interaction_temp_windspeed,
  data = trainData,
  mtry = 3,
  ntree = 500
)
print(model_with_interactions)
## 
## Call:
##  randomForest(formula = cnt ~ temp + atemp + hum + windspeed +      weathersit + interaction_temp_hum + interaction_temp_windspeed,      data = trainData, mtry = 3, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 1730912
##                     % Var explained: 53.19
# Grid Search for Random Forest hyperparameters
tune_grid <- expand.grid(
  mtry = c(2, 3, 4, 5)
)

control <- trainControl(method = "cv", number = 10)  # 10-fold cross-validation

tuned_model <- train(
  cnt ~ temp + atemp + hum + windspeed + weathersit + interaction_temp_hum + interaction_temp_windspeed,
  data = trainData,
  method = "rf",
  tuneGrid = tune_grid,
  trControl = control
)

# Display best model parameters and results
print(tuned_model)
## Random Forest 
## 
## 587 samples
##   7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 528, 528, 528, 527, 528, 528, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   2     1300.983  0.5454981  1088.665
##   3     1312.526  0.5390058  1095.222
##   4     1320.904  0.5340101  1096.204
##   5     1320.517  0.5339939  1095.934
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
print(tuned_model$bestTune)
##   mtry
## 1    2
# Assuming 'holiday' and 'weekday' are available in your dataset
model_extended <- randomForest(
  cnt ~ temp + atemp + hum + windspeed + weathersit + weekday + holiday,
  data = trainData,
  mtry = tuned_model$bestTune$mtry,
  ntree = 500
)
print(model_extended)
## 
## Call:
##  randomForest(formula = cnt ~ temp + atemp + hum + windspeed +      weathersit + weekday + holiday, data = trainData, mtry = tuned_model$bestTune$mtry,      ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 1636957
##                     % Var explained: 55.73
# Using GBM
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(123)
gbm_model <- gbm(
  formula = cnt ~ temp + atemp + hum + windspeed + weathersit,
  data = trainData,
  distribution = "gaussian",
  n.trees = 500,
  interaction.depth = 3,
  shrinkage = 0.01,
  cv.folds = 5,
  n.minobsinnode = 10
)
summary(gbm_model)

##                   var   rel.inf
## atemp           atemp 38.427339
## temp             temp 35.356696
## hum               hum 16.056909
## windspeed   windspeed  7.373430
## weathersit weathersit  2.785625
# Neural Network
library(nnet)
set.seed(123)
nn_model <- nnet(
  cnt ~ temp + atemp + hum + windspeed + weathersit,
  data = trainData,
  size = 5,
  linout = TRUE,
  decay = 1e-4,
  maxit = 500
)
## # weights:  41
## initial  value 14039737283.273609 
## iter  10 value 2171086214.385982
## iter  20 value 2170835933.702363
## iter  30 value 2122083152.691980
## iter  40 value 1384887323.458193
## iter  50 value 1075391180.695228
## iter  60 value 989619331.976347
## iter  70 value 982336795.945481
## iter  80 value 968902998.603212
## iter  90 value 968859239.241991
## iter 100 value 968842620.709985
## iter 100 value 968842619.722309
## final  value 968842366.859102 
## converged
print(nn_model)
## a 6-5-1 network with 41 weights
## inputs: temp atemp hum windspeed weathersitMist weathersitLight Rain/Snow 
## output(s): cnt 
## options were - linear output units  decay=1e-04

This code chunk answers Further Researched Question #2:

How was attendance of the National Cherry Blossom Festival in D.C. and did the yearly event cause a spike in casual rentals.

Importing cherry blossoms dataset

cherry_blossoms <- read_csv("cherry-blossoms_filtered_2011-2013.csv", show_col_types = FALSE)

#rename some variables
cherry_blossoms <- rename(cherry_blossoms, peak_bloom = "Yoshino Peak Bloom Date", festival_start_date = "Festival Start Date", festival_duration = "Festival Duration")

#selecting festival start date and duration 
cherry_blossoms <- select(cherry_blossoms, Year, festival_start_date, festival_duration)

#reformating festival start date and the end date
cherry_blossoms <- cherry_blossoms %>%
  mutate(
    Year = as.numeric(Year),
    festival_start_date = as.Date(festival_start_date - 1, origin=paste0(Year - 1, "-12-31")),
    festival_end_date = festival_start_date + as.numeric(festival_duration) - 1
  )

cherry_blossoms
## # A tibble: 3 × 4
##    Year festival_start_date festival_duration festival_end_date
##   <dbl> <date>                          <dbl> <date>           
## 1  2011 2011-03-25                         15 2011-04-08       
## 2  2012 2012-03-19                         38 2012-04-25       
## 3  2013 2013-03-19                         25 2013-04-12
day <- day %>%
  mutate(Year = year(date))

# Perform an inner_join to merge day and cherry_blossoms data
merged_data <- inner_join(cherry_blossoms, day, by = "Year")

# Filter for festival dates
festival_data <- merged_data %>%
  filter(date >= festival_start_date & date <= festival_end_date)

# Select needed variables for casual rentals
festival_casual_rentals <- festival_data %>%
  select(festival_start_date, festival_end_date, date, casual)

# Summarize casual bike rentals by festival
total_casual_rentals_by_festival <- festival_casual_rentals %>%
  group_by(festival_start_date, festival_end_date) %>%
  summarize(total_casual_rentals = sum(casual))
## `summarise()` has grouped output by 'festival_start_date'. You can override
## using the `.groups` argument.
# Output total casual rentals by festival
total_casual_rentals_by_festival
## # A tibble: 2 × 3
## # Groups:   festival_start_date [2]
##   festival_start_date festival_end_date total_casual_rentals
##   <date>              <date>                           <dbl>
## 1 2011-03-25          2011-04-08                        7552
## 2 2012-03-19          2012-04-25                       48407
# Calculate the mean casual bike rentals during festivals from 2011-2012
average_casual_rentals_during_festivals <- merged_data %>%
  filter(date >= festival_start_date & date <= festival_end_date) %>%
  summarise(average_casual_rentals_during_festival = mean(casual, na.rm = TRUE))

# Output average casual rentals during festivals
average_casual_rentals_during_festivals
## # A tibble: 1 × 1
##   average_casual_rentals_during_festival
##                                    <dbl>
## 1                                  1056.
# Create an identifier for the festival periods
cherry_blossoms <- cherry_blossoms %>%
  mutate(festival_id = row_number())

# Merge with day dataset to exclude the periods of the festival
day_with_festival <- day %>%
  left_join(cherry_blossoms, by = "Year") %>%
  mutate(is_festival = date >= festival_start_date & date <= festival_end_date,
         is_festival = ifelse(is.na(is_festival), FALSE, is_festival))

# Calculate the mean casual bike rentals excluding festival days from 2011-2012
average_casual_rentals_excluding_festival <- day_with_festival %>%
  filter(!is_festival) %>%
  summarise(average_casual_rentals_excluding_festival = mean(casual, na.rm = TRUE))

# Output average casual rentals excluding festival days
average_casual_rentals_excluding_festival
## # A tibble: 1 × 1
##   average_casual_rentals_excluding_festival
##                                       <dbl>
## 1                                      832.
library(dplyr)
library(readr)
library(ggplot2)

# Load the cherry blossoms dataset
cherry_blossoms <- read_csv("cherry-blossoms_filtered_2011-2013.csv", show_col_types = FALSE) %>%
  rename(
    peak_bloom = "Yoshino Peak Bloom Date",
    festival_start_date = "Festival Start Date",
    festival_duration = "Festival Duration"
  ) %>%
  select(Year, festival_start_date, festival_duration) %>%
  mutate(
    Year = as.numeric(Year),
    festival_start_date = as.Date(festival_start_date - 1, origin=paste0(Year - 1, "-12-31")),
    festival_end_date = festival_start_date + as.numeric(festival_duration) - 1
  )

# Load the bike rental data
day <- read_csv("day.csv") %>%
  mutate(
    date = as.Date(dteday),
    Year = year(date)
  )
## Rows: 731 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (15): instant, season, yr, mnth, holiday, weekday, workingday, weathers...
## date  (1): dteday
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Merge the datasets
merged_data <- left_join(day, cherry_blossoms, by = "Year") %>%
  mutate(
    period = case_when(
      date >= festival_start_date & date <= festival_end_date ~ "During Festival",
      date >= (festival_start_date - 30) & date < festival_start_date ~ "Before Festival",
      date > festival_end_date & date <= (festival_end_date + 30) ~ "After Festival",
      TRUE ~ "Non-Festival"
    )
  ) %>%
  filter(Year %in% c(2011, 2012)) # Filter for the years 2011 and 2012

# Filter data to include only the range around festival dates
filtered_data <- merged_data %>%
  group_by(Year) %>%
  mutate(
    start_min = min(festival_start_date - 30),
    end_max = max(festival_end_date + 30)
  ) %>%
  ungroup() %>%
  filter(date >= start_min & date <= end_max)

# Aggregate total rentals by date and period
daily_rentals <- filtered_data %>%
  group_by(date, period, Year) %>%
  summarize(total_rentals = sum(cnt), .groups = 'drop')

# Plot the data with facets
p <- ggplot(daily_rentals, aes(x = date, y = total_rentals, fill = period)) +
  geom_col(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("Before Festival" = "blue", "During Festival" = "red", "After Festival" = "green", "Non-Festival" = "grey")) +
  labs(title = "Daily Bike Rentals During and Around Cherry Blossom Festival Periods",
       x = "Date", y = "Total Bike Rentals") +
  facet_wrap(~ Year, scales = "free_x") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in geom_col(stat = "identity", position = "dodge"): Ignoring unknown
## parameters: `stat`
# Print the plot
print(p)

Upon analyzing the average rentals during festival and excluding festival in the years of 2011 and 2012, we calculated that there is an 8.42% change. This means that during cherry blossom festival duration there is an increased amount of bike rentals compared to normal times of the year. Another finding is that in 2012 there is way more total bike rentals compared to 2011 during the days of cherry blossom festival. This can be explained by looking into the weather situation which was investigated earlier. In March of 2012 the temperature was 10 degrees Celsius higher compared to previous years, so normally it would be in the 50s but in March of 2012 it was in the 70s reaching the 80s. This could mean that warmer weather may have explained the drastic increase in total bike rentals in 2012 compared to 2011.

library(readr)
library(tidyverse)
day <- read_csv("hour.csv")
## Rows: 17379 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (16): instant, season, yr, mnth, hr, holiday, weekday, workingday, weat...
## date  (1): dteday
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(day)
##  [1] "instant"    "dteday"     "season"     "yr"         "mnth"      
##  [6] "hr"         "holiday"    "weekday"    "workingday" "weathersit"
## [11] "temp"       "atemp"      "hum"        "windspeed"  "casual"    
## [16] "registered" "cnt"
dayGrouped <- day %>%
  group_by(season) %>%
  summarize(
    Casualusers = sum(casual),
    Registeredusers = sum(registered),
    AvgTemp = round(mean(temp) *100, 2),
    Users = sum(casual) + sum(registered)
  )
dayGrouped_long <- dayGrouped %>%
  pivot_longer(cols = c(Casualusers, Registeredusers), names_to = "UserType", values_to = "Count")

ggplot(dayGrouped_long, aes(x = season, y = Count, fill = UserType)) +
  geom_col(position = "stack") +
  theme_minimal() +
  coord_cartesian(ylim = c(300000, 1150000)) +
  scale_fill_manual(values = c("Casualusers" = "lightblue", "Registeredusers" = "lightgray")) +
  labs(fill = "User Type")

ggplot(dayGrouped) +
  geom_col(aes(season, Users, fill=as.factor(AvgTemp))) +
  coord_cartesian(ylim = c(300000, 1150000)) +
  theme_minimal() +
  scale_fill_brewer(palette="OrRd")

dayGroupedTwice <- day %>%
  group_by(yr, season) %>%
  summarize(
    Casualusers = sum(casual),
    Registeredusers = sum(registered),
    AvgTemp = round(mean(temp) * 100, 2),
    Users = sum(casual) + sum(registered)
  )
## `summarise()` has grouped output by 'yr'. You can override using the `.groups`
## argument.
ggplot(dayGroupedTwice) +
  geom_col(aes(season, Users, fill=as.factor(AvgTemp))) +
  coord_cartesian(ylim = c(100000, 650000)) +
  theme_minimal() +
  scale_fill_brewer(palette="OrRd") +
  facet_wrap(~yr)

ggplot(day) +
  geom_point(aes(cnt,temp,color=as.factor(weekday))) +
  theme_minimal() +
  facet_wrap(~workingday) +
  scale_color_brewer(palette = "Dark2")

data <- mutate(day, yr = ifelse(yr == 0, 2011, 2012),
               mnth = case_when(
                 mnth == 1 ~ "January",
                 mnth == 2 ~ "February",
                 mnth == 3 ~ "March",
                 mnth == 4 ~ "April",
                 mnth == 5 ~ "May",
                 mnth == 6 ~ "June",
                 mnth == 7 ~ "July",
                 mnth == 8 ~ "August",
                 mnth == 9 ~ "September",
                 mnth == 10 ~ "October",
                 mnth == 11 ~ "November",
                 mnth == 12 ~ "December"
               ))

# Convert weekday column to actual day names
data <- mutate(data, weekday = case_when(
  weekday == 0 ~ "Sunday",
  weekday == 1 ~ "Monday",
  weekday == 2 ~ "Tuesday",
  weekday == 3 ~ "Wednesday",
  weekday == 4 ~ "Thursday",
  weekday == 5 ~ "Friday",
  weekday == 6 ~ "Saturday"
))

# Convert 'dteday' column to date format
data$dteday <- as.Date(data$dteday)

# Define important event dates
important_dates <- c(
  "2011-01-20",  # Presidential Inauguration of Barack Obama
  "2011-04-09",  # Dedication of the Martin Luther King Jr. Memorial
  "2011-06-30",  # End of the "Don't Ask, Don't Tell" policy
  "2011-08-23",  # East Coast Earthquake
  "2011-10-30",  # Dedication of the Franklin Delano Roosevelt Memorial
  "2011-12-15",  # Last convoy of U.S. troops leaves Iraq
  "2012-01-21",  # Second Presidential Inauguration of Barack Obama
  "2012-05-20",  # Dedication of the Martin Luther King Jr. National Memorial Library
  "2012-06-28",  # Supreme Court upholds the Affordable Care Act
  "2012-08-06",   # International AIDS Conference
  "2011-01-18",  # Martin Luther King Jr. Day
  "2011-07-04",  # Independence Day Celebrations
  "2011-09-24",  # National Book Festival
  "2012-02-11",  # "Portraits of African American Heroes" exhibition at the Smithsonian
  "2012-11-06",  # United States Presidential Election
  "2011-05-01",  # Death of Osama bin Laden
  "2011-09-11"   # 10th Anniversary of the September 11 Attacks
)

# Create binary variable indicating whether it's an important date
data$is_important <- ifelse(data$dteday %in% as.Date(important_dates), 1, 0)

# Define cherry blossom festival dates
cherry_blossom_dates <- c(seq(from = as.Date("2011-03-26"), to = as.Date("2011-04-10"), by = "day"),
                        seq(from = as.Date("2012-03-20"), to = as.Date("2012-04-27"), by = "day"))


# Create binary variable indicating whether it's a peak blossom day
data$is_cherry_blossom <- ifelse(data$dteday %in% as.Date(cherry_blossom_dates), 1, 0)

# Create binary variable indicating whether it's both an important date and a peak blossom day
data$is_important_and_cherry <- ifelse(data$is_important == 1 & data$is_cherry_blossom == 1, 1, 0)

# Print the updated dataset
head(data)
## # A tibble: 6 × 20
##   instant dteday     season    yr mnth       hr holiday weekday  workingday
##     <dbl> <date>      <dbl> <dbl> <chr>   <dbl>   <dbl> <chr>         <dbl>
## 1       1 2011-01-01      1  2011 January     0       0 Saturday          0
## 2       2 2011-01-01      1  2011 January     1       0 Saturday          0
## 3       3 2011-01-01      1  2011 January     2       0 Saturday          0
## 4       4 2011-01-01      1  2011 January     3       0 Saturday          0
## 5       5 2011-01-01      1  2011 January     4       0 Saturday          0
## 6       6 2011-01-01      1  2011 January     5       0 Saturday          0
## # ℹ 11 more variables: weathersit <dbl>, temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## #   is_important <dbl>, is_cherry_blossom <dbl>, is_important_and_cherry <dbl>
sorted_data <- data[order(data$casual), ]

# Select the top 5 and bottom 5 rows
top_5_least <- sorted_data[1:5, ]
bottom_5_most <- sorted_data[(nrow(sorted_data) - 4):nrow(sorted_data), ]

# Create a combined dataset for display
combined_data <- rbind(top_5_least, bottom_5_most)

# Convert 'dteday' column to Date format
combined_data$dteday <- as.Date(combined_data$dteday)

# Create a copy of the dataset with formatted date
formatted_combined_data <- combined_data
formatted_combined_data$dteday <- format(formatted_combined_data$dteday, "%Y-%m-%d")
formatted_combined_data <- formatted_combined_data %>%
  select(dteday,
         weekday,
         holiday,
         workingday,
         temp,
         atemp,
         hum,
         windspeed,
         cnt,
         is_important,
         is_cherry_blossom,
         is_important_and_cherry) %>%
  mutate(temp = round(temp * 100),
         atemp = round(atemp * 100),
         hum = round(hum * 100),
         windspeed = round(windspeed * 100))

binary_to_yes_no <- function(x) {
  ifelse(x == 0, "No", "Yes")
}

# Modify the binary variables to Yes or No
tableData <- formatted_combined_data %>%
  mutate_at(vars(is_important, is_cherry_blossom, is_important_and_cherry, holiday, workingday), binary_to_yes_no) %>%
  rename(
    Date = dteday,
    DayOfWeek = weekday,
    Holiday = holiday,
    WorkingDay = workingday,
    HeatIndex = atemp,
    Temperature = temp,
    Humidity = hum,
    WindSpeed = windspeed,
    Users = cnt,
    ImportantDate = is_important,
    CherryFestivalDay = is_cherry_blossom,
    ImportantAndPeakBlossomDay = is_important_and_cherry
  )

# Display the combined dataset with kable
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Display the combined dataset with kable and color formatting
kable(tableData, caption = "Top 5 Days with Least and Most Users", align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  column_spec(5, color = "white",
              background = spec_color(tableData$Temperature[1:10], end = 0.7)) 
Top 5 Days with Least and Most Users
Date DayOfWeek Holiday WorkingDay Temperature HeatIndex Humidity WindSpeed Users ImportantDate CherryFestivalDay ImportantAndPeakBlossomDay
2011-01-01 Saturday No No 24 29 75 0 1 No No No
2011-01-01 Saturday No No 24 26 75 9 1 No No No
2011-01-02 Sunday No No 42 42 77 30 2 No No No
2011-01-02 Sunday No No 40 41 76 19 1 No No No
2011-01-02 Sunday No No 40 41 71 22 8 No No No
2012-05-19 Saturday No No 72 65 30 10 672 No No No
2012-03-17 Saturday No No 64 62 53 13 679 No No No
2012-05-19 Saturday No No 72 65 30 9 730 No No No
2012-10-06 Saturday No No 70 65 54 10 743 No No No
2012-03-17 Saturday No No 64 62 50 0 685 No No No
ggplot(data, aes(x=atemp, y=cnt, color=as.factor(is_cherry_blossom))) +
  geom_point() +  
  theme_minimal() +
  labs(x = "Heat Index (atemp)", y = "Total Users") +
  scale_color_manual(values = c("lightblue", "red"), name = "Cherry Blossom", labels = c("Not Cherry Blossom", "Cherry Blossom"))

p1 <- ggplot(day, aes(x = temp, y = casual + registered, color = temp)) +
  geom_point() +
  labs(x = "Temperature", y = "Total Users") +
  scale_color_continuous(name = "Temperature")

# Plot for Humidity
p2 <- ggplot(day, aes(x = hum, y = casual + registered, color = hum)) +
  geom_point() +
  labs(x = "Humidity", y = "Total Users") +
  scale_color_continuous(name = "Humidity")

# Plot for WindSpeed
p3 <- ggplot(day, aes(x = windspeed, y = casual + registered, color = windspeed)) +
  geom_point() +
  labs(x = "Wind Speed", y = "Total Users") +
  scale_color_continuous(name = "Wind Speed", trans = "reverse")

p4 <- ggplot(day, aes(x = atemp, y = casual + registered, color = atemp)) +
  geom_point() +
  labs(x = "Heat Index", y = "Total Users") +
  scale_color_continuous(name = "Heat Index")

p5 <- ggplot(day, aes(x = factor(weathersit), y = casual + registered, color = factor(weathersit))) +
  geom_boxplot() +
  labs(x = "Weather Situation", y = "Total Users") +
  scale_color_discrete(name = "Weather Situation")

# Combine plots into one picture
combined_plot <- gridExtra::grid.arrange(p1, p2, p3, p4, p5, nrow = 2)

# Display the combined plot
print(combined_plot)
## TableGrob (2 x 3) "arrange": 5 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (2-2,1-1) arrange gtable[layout]
## 5 5 (2-2,2-2) arrange gtable[layout]